import pandas as pd

import sys
def _sample_pairs(df, arg_col, raw_col, arg_id, n, seed):
    """Return ``n`` unique (``arg_col``, ``raw_col``) pairs sampled from ``df``.

    The two columns are renamed to ``'ARG'`` and ``'ARG-RAW'`` and a constant
    ``'arg'`` column holding ``arg_id`` is added, so samples drawn from
    different source columns can be stacked into one table.

    Raises:
        KeyError: if ``arg_col`` or ``raw_col`` is not a column of ``df``.
        ValueError: (from ``DataFrame.sample``) if ``df`` holds fewer than
            ``n`` unique pairs.
    """
    pairs = df[[arg_col, raw_col]].drop_duplicates().sample(n, random_state=seed)
    pairs = pairs.rename(columns={arg_col: 'ARG', raw_col: 'ARG-RAW'})
    pairs['arg'] = arg_id
    return pairs


def _shuffle_labels(pairs):
    """Randomly permute the ``'ARG'`` column of ``pairs`` in place and return it.

    ``'ARG-RAW'`` keeps its row order, so after the call a row's ``'ARG'``
    value generally belongs to a different row of the same sample.
    """
    # Going through a plain list drops the index; otherwise pandas would
    # re-align the shuffled values to their original rows and undo the shuffle.
    # NOTE(review): the permutation is unseeded, and nothing prevents a value
    # from landing back on its own row (or on a row with an equal label), so
    # some rows later marked true = 0 may still carry a correct pairing.
    pairs['ARG'] = list(pairs['ARG'].sample(len(pairs)))
    return pairs


def build_validation_sample(df, n=250, true_seed=333, fake_seed=444):
    """Build the shuffled table of true and fake ARG / ARG-RAW pairs.

    Four samples of ``n`` unique pairs each are stacked: true pairs for the
    ARG1 and ARGO columns (``true = 1``) and label-shuffled pairs for the same
    columns (``true = 0``). The stacked rows are then put in random order and
    an empty ``'similarity'`` column is appended.

    Args:
        df: frame with columns ``'ARG1'``, ``'ARG1-RAW'``, ``'ARGO'`` and
            ``'ARGO-RAW'``.
        n: number of unique pairs drawn for each of the four samples.
        true_seed: ``random_state`` used when sampling the true pairs.
        fake_seed: ``random_state`` used when sampling the pairs whose labels
            are then shuffled.

    Returns:
        A DataFrame of ``4 * n`` rows with columns
        ``['ARG', 'ARG-RAW', 'arg', 'true', 'similarity']``.
    """
    # NOTE(review): the column names below are spelled 'ARGO' (capital letter
    # O), not 'ARG0' (zero) -- confirm this matches the CSV header.

    # Get true records
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    true_rows = pd.concat([
        _sample_pairs(df, 'ARG1', 'ARG1-RAW', 1, n, true_seed),
        _sample_pairs(df, 'ARGO', 'ARGO-RAW', 0, n, true_seed),
    ])
    true_rows['true'] = 1

    # Get fake records
    fake_arg1 = _shuffle_labels(
        _sample_pairs(df, 'ARG1', 'ARG1-RAW', 1, n, fake_seed))
    fake_arg1['true'] = 0

    fake_arg0 = _shuffle_labels(
        _sample_pairs(df, 'ARGO', 'ARGO-RAW', 0, n, fake_seed))
    fake_arg0['true'] = 0

    df_check = pd.concat([true_rows, fake_arg0, fake_arg1])

    # Sampling every row without replacement only changes the order of the
    # observations within the file (unseeded, so the order differs per run).
    df_check = df_check.sample(len(df_check))

    # Empty column, presumably to be filled in by the human validators.
    df_check['similarity'] = ''
    return df_check


def main():
    """Read the input CSV selected by the first CLI argument and write both outputs.

    The first command-line argument is an integer that is inserted into the
    input and output file names.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {0} <c>   (integer used in the data file names)'
                 .format(sys.argv[0]))
    c = int(sys.argv[1])

    df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_{0}_no_frequency_filter.csv'.format(c))

    df_check = build_validation_sample(df)

    # Full table, including the 'arg' and 'true' columns.
    df_check.to_csv('../data/human_validation/cluster_validation_{0}.csv'.format(c), index=False)

    # Reduced table without the 'arg' / 'true' / 'similarity' columns.
    df_check[['ARG', 'ARG-RAW']].to_csv('../data/human_validation/cluster_validation_{0}_send_out.csv'.format(c), index=False)


if __name__ == '__main__':
    main()
